IMPORTS

In [1]:
#!pip install pystan fbprophet
#!pip install pmdarima
In [2]:
import pandas as pd
from fbprophet import Prophet
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

EDA

In [3]:
# import data & convert datetime column
df = pd.read_csv("Historic_GRPs_2021-09-15.csv")
df['date'] = pd.to_datetime(df['Date'])
del df['Date']
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
In [4]:
df.head()
Out[4]:
AMSTEL BEER ANGRY ORCHARD BEVERAGES-ALCOHOLIC HARD CIDER ANHEUSER-BUSCH BEER ATHLETIC BREWING BEER BALLAST POINT BREWING CO BEER BEST DAMN SODA BLUE MOON BEER BON VIV SPIKED SELTZER BUD LIGHT BEER BUD LIGHT BEER CHELADA ... TRAVELER BEER TRULY SPIKED AND SPARKLING ALCOHOLIC BEVERAGE TWISTED TEA BEVERAGES-ALCOHOLIC MALT LIQUOR VICTORIA BEER VIZZY HARD SELTZER WHITE CLAW HARD SELTZER WHITE CLAW HARD SELTZER ICED TEA date year month
0 NaN 106.12 NaN NaN NaN NaN 0.81 NaN 323.45 NaN ... NaN NaN NaN NaN NaN NaN NaN 2015-01-01 2015 1
1 NaN 52.81 0.42 NaN NaN NaN 1.10 NaN 118.25 NaN ... NaN NaN 6.92 NaN NaN NaN NaN 2015-02-01 2015 2
2 NaN 57.90 NaN NaN NaN NaN 125.26 NaN 368.93 NaN ... 29.28 NaN 7.06 NaN NaN NaN NaN 2015-03-01 2015 3
3 NaN 44.74 0.90 NaN NaN NaN 137.76 NaN 266.26 NaN ... 61.40 NaN 80.92 NaN NaN NaN NaN 2015-04-01 2015 4
4 NaN 68.78 21.02 NaN NaN NaN 92.34 NaN 806.58 NaN ... 95.76 NaN 72.77 NaN NaN NaN NaN 2015-05-01 2015 5

5 rows × 104 columns

Data Split

In [5]:
co = df.loc[:, df.columns.isin(
    ['date','year', 'AMSTEL BEER','ANGRY ORCHARD BEVERAGES-ALCOHOLIC HARD CIDER', 'ATHLETIC BREWING BEER', 
     'BALLAST POINT BREWING CO BEER', 'BLUE MOON BEER', 'CAPE LINE BEVERAGES-ALCOHOLIC',
     'CERVEZA SOL', 'COORS BREWING BEER', 'COORS HARD SELTZER', 'COORS LIGHT BEER', 
     'CORONA EXTRA BEER', 'CORONA HARD SELTZER', 'CORONA LIGHT BEER','CORONA PREMIER BEER', 'CORONA REFRESCA',
     'CRISPIN BEVERAGES-ALCOHOLIC', 'DESCHUTES BREWERY BEER', 
     'DOS EQUIS BEER', 'FIRESTONE WALKER BEER', 'FOUR LOCO BEVERAGES-ALCOHOLIC', 'FOUR LOCO CAFFEINATED BEVERAGES-ALCOHOLIC', 
     'GOLD BUCKLE BEER','GUINESS COLD BREW', 'GUINNESS BEER', 'GUINNESS BLONDE AMERICAN LAGER BEER', 'GUINNESS NITRO IPA', 
     'HEINEKEN BEER', 'HEINEKEN LIGHT BEER', 
     'HENRYS HARD SODA', 'HIGH NOON SUN SIPS', 'LEINENKUGELS BEER', 'MICKEY\'S FINE MALT LIQUOR', 
     'MIKES BEVERAGES-ALCOHOLIC MALT COCKTAIL', 'MIKES HARD LEMONADE', 'MIKES HARD LEMONADE SELTZER', 
     'MILLER BREWING BEER', 'MILLER HIGH LIFE BEER', 'MILLER LITE BEER', 
     'MODELO ESPECIAL BEER',  'MODELO ESPECIAL CHELADA BEER', 'PABST BEER', 
     'PALM BREEZE BEVERAGES-ALCOHOLIC', 'PERONI BEER','SAINT ARCHER BREWING', 'SAMUEL ADAMS BEER', 
     'SAUCY BREW WORKS BEER', 'SEAGRAMS ESCAPES BEVERAGES-ALCOHOLIC', 'SHINER BEER', 'SHIPYARD BEER ISLAND TIME', 
     'SIERRA NEVADA BEER', 'SMIRNOFF HARD SELTZER', 'SMIRNOFF ICE BEVERAGES-ALCOHOLIC', 'SMITH & FORGE BEVERAGES-ALCOHOLIC HARD CIDER', 
     'STRONGBOW BEVERAGES-ALCOHOLIC HARD CIDER', 'TECATE BEER', 'TECATE LIGHT BEER', 'TRAVELER BEER', 
     'TRULY SPIKED AND SPARKLING ALCOHOLIC BEVERAGE', 'TWISTED TEA BEVERAGES-ALCOHOLIC MALT LIQUOR', 'VIZZY HARD SELTZER',
     'WHITE CLAW HARD SELTZER', 'WHITE CLAW HARD SELTZER ICED TEA'])]

ab = df.loc[:, ~df.columns.isin(
    ['AMSTEL BEER','ANGRY ORCHARD BEVERAGES-ALCOHOLIC HARD CIDER', 'ATHLETIC BREWING BEER', 
     'BALLAST POINT BREWING CO BEER', 'BLUE MOON BEER', 'CAPE LINE BEVERAGES-ALCOHOLIC',
     'CERVEZA SOL', 'COORS BREWING BEER', 'COORS HARD SELTZER', 'COORS LIGHT BEER', 
     'CORONA EXTRA BEER', 'CORONA HARD SELTZER', 'CORONA LIGHT BEER','CORONA PREMIER BEER', 'CORONA REFRESCA',
     'CRISPIN BEVERAGES-ALCOHOLIC', 'DESCHUTES BREWERY BEER', 
     'DOS EQUIS BEER', 'FIRESTONE WALKER BEER', 'FOUR LOCO BEVERAGES-ALCOHOLIC', 'FOUR LOCO CAFFEINATED BEVERAGES-ALCOHOLIC', 
     'GOLD BUCKLE BEER','GUINESS COLD BREW', 'GUINNESS BEER', 'GUINNESS BLONDE AMERICAN LAGER BEER', 'GUINNESS NITRO IPA', 
     'HEINEKEN BEER', 'HEINEKEN LIGHT BEER', 
     'HENRYS HARD SODA', 'HIGH NOON SUN SIPS', 'LEINENKUGELS BEER', 'MICKEY\'S FINE MALT LIQUOR', 
     'MIKES BEVERAGES-ALCOHOLIC MALT COCKTAIL', 'MIKES HARD LEMONADE', 'MIKES HARD LEMONADE SELTZER', 
     'MILLER BREWING BEER', 'MILLER HIGH LIFE BEER', 'MILLER LITE BEER', 
     'MODELO ESPECIAL BEER',  'MODELO ESPECIAL CHELADA BEER', 'PABST BEER', 
     'PALM BREEZE BEVERAGES-ALCOHOLIC', 'PERONI BEER','SAINT ARCHER BREWING', 'SAMUEL ADAMS BEER', 
     'SAUCY BREW WORKS BEER', 'SEAGRAMS ESCAPES BEVERAGES-ALCOHOLIC', 'SHINER BEER', 'SHIPYARD BEER ISLAND TIME', 
     'SIERRA NEVADA BEER', 'SMIRNOFF HARD SELTZER', 'SMIRNOFF ICE BEVERAGES-ALCOHOLIC', 'SMITH & FORGE BEVERAGES-ALCOHOLIC HARD CIDER', 
     'STRONGBOW BEVERAGES-ALCOHOLIC HARD CIDER', 'TECATE BEER', 'TECATE LIGHT BEER', 'TRAVELER BEER', 
     'TRULY SPIKED AND SPARKLING ALCOHOLIC BEVERAGE', 'TWISTED TEA BEVERAGES-ALCOHOLIC MALT LIQUOR', 'VIZZY HARD SELTZER',
     'WHITE CLAW HARD SELTZER', 'WHITE CLAW HARD SELTZER ICED TEA'])]
In [6]:
print(f"Number of rows: {co.shape[0]}\nNumber of columns: {len(list(co.columns))}\nFile size in bytes: {co.size}")
Number of rows: 79
Number of columns: 64
File size in bytes: 5056

Descriptives

In [7]:
# grab summary statistics for entire data frame (can be subset)
co.describe()
Out[7]:
AMSTEL BEER ANGRY ORCHARD BEVERAGES-ALCOHOLIC HARD CIDER ATHLETIC BREWING BEER BALLAST POINT BREWING CO BEER BLUE MOON BEER CAPE LINE BEVERAGES-ALCOHOLIC CERVEZA SOL COORS BREWING BEER COORS HARD SELTZER COORS LIGHT BEER ... STRONGBOW BEVERAGES-ALCOHOLIC HARD CIDER TECATE BEER TECATE LIGHT BEER TRAVELER BEER TRULY SPIKED AND SPARKLING ALCOHOLIC BEVERAGE TWISTED TEA BEVERAGES-ALCOHOLIC MALT LIQUOR VIZZY HARD SELTZER WHITE CLAW HARD SELTZER WHITE CLAW HARD SELTZER ICED TEA year
count 2.000000 60.000000 1.00 1.00 75.000000 6.000000 11.000000 47.000000 6.000000 79.000000 ... 35.000000 65.000000 32.00000 10.000000 42.000000 43.000000 9.000000 27.000000 2.000000 79.000000
mean 2.415000 72.432500 0.97 16.82 109.048933 86.915000 174.909091 64.373404 41.165000 225.396962 ... 89.467143 40.627538 17.22750 54.892000 94.355238 61.887442 54.395556 81.151852 20.045000 2017.810127
std 1.718269 42.901036 NaN NaN 78.318894 45.411119 131.292674 52.116520 28.887793 103.148155 ... 52.610696 42.565841 25.54243 33.589993 58.409631 37.710284 32.828073 84.844941 12.537003 1.922045
min 1.200000 8.390000 0.97 16.82 0.040000 13.200000 6.140000 0.000000 5.850000 50.290000 ... 0.110000 0.000000 0.01000 9.810000 0.450000 0.420000 7.610000 0.010000 11.180000 2015.000000
25% 1.807500 46.180000 0.97 16.82 49.670000 77.997500 77.965000 19.365000 20.710000 131.345000 ... 63.065000 6.830000 1.37250 30.035000 59.320000 40.375000 25.960000 4.530000 15.612500 2016.000000
50% 2.415000 61.110000 0.97 16.82 105.610000 88.725000 172.790000 53.040000 39.165000 223.290000 ... 88.930000 17.320000 6.57500 50.750000 84.930000 58.270000 61.740000 60.140000 20.045000 2018.000000
75% 3.022500 92.282500 0.97 16.82 147.270000 99.415000 214.350000 103.910000 60.275000 303.435000 ... 109.155000 75.720000 15.91250 85.885000 136.257500 85.900000 66.730000 145.070000 24.477500 2019.000000
max 3.630000 272.390000 0.97 16.82 339.870000 153.440000 466.300000 177.730000 80.940000 463.420000 ... 234.000000 145.330000 103.53000 96.640000 227.820000 151.840000 97.530000 249.220000 28.910000 2021.000000

8 rows × 63 columns

Barplot

In [8]:
# create long version dataframe
df_stacked = co.copy()
del df_stacked['date']

frames = []
for col in list(df_stacked.columns):
    col_values = list(df[col])
    col_names = [col] * len(col_values)
    
    frames.append(pd.DataFrame(list(zip(col_values,col_names)),columns=['GRP','Brand']))
    
df_long = pd.concat(frames)
In [9]:
# https://plotly.com/python/box-plots/
fig = px.box(
    df_long, 
    x="Brand", 
    y="GRP", 
    width=1200, 
    height=400,     
    title="GRP Spend by Beverage Brand")

fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()
#fig.show(renderer='browser')

Exclude brands with unspecified beer_type

In [10]:
comp = co.loc[:, ~co.columns.isin(
    ['COORS BREWING BEER',
 'DESCHUTES BREWERY BEER',
 'DOS EQUIS BEER',
 'FIRESTONE WALKER BEER',
 'GUINESS COLD BREW',
 'GUINNESS BEER',
 'HIGH NOON SUN SIPS',
 'KONA BREWING CO BEER',
 'LAGUNITAS BREWING CO BEER',
 'LEINENKUGELS BEER',
 'MILLER BREWING BEER',
 'MODELO ESPECIAL CHELADA BEER',
 'MONTEJO BEER',
 'OLD BLUE LAST BEER',
 'RITAS FAMILY',
 'SAINT ARCHER BREWING'])]
In [11]:
feature_frames = []

for i in range(comp.shape[0]):
    for col in list(comp.columns): 
        for year in list(comp['year'].unique()):
            comp_sub = comp[comp['year'] == year] 
        
            try:
                internal_prop = comp_sub[col][i] / comp_sub[col].sum() 
                external_prop = comp_sub[col][i] / comp_sub.sum(axis=1)[i] 
                grp = comp_sub[col][i]
                annual_prop = comp[col].sum() / comp_sub[col].sum()
                z_score = (comp_sub[col][i] / comp_sub[col].mean()) / comp_sub[col].std()
                mu = comp_sub[col].mean()
                std = comp_sub[col].std()
                count = len([i for i in list(comp_sub[col]) if i > 0])
                

                features = [col,internal_prop,external_prop, grp, z_score, mu, std, count, str(comp_sub['date'][i])]

                feature_frames.append(features)

            except:
                pass
            
#Create the pandas DataFrame
cluster_df = pd.DataFrame(
    feature_frames, 
    columns = ['brand','internal_prop','external_prop','grp', 'z_score', 'mu', 'std', 'count', 'date']).fillna(0)
C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning:

Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\ipykernel_launcher.py:12: RuntimeWarning:

divide by zero encountered in double_scalars

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\ipykernel_launcher.py:12: RuntimeWarning:

invalid value encountered in double_scalars

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\ipykernel_launcher.py:13: RuntimeWarning:

divide by zero encountered in double_scalars

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\ipykernel_launcher.py:9: RuntimeWarning:

invalid value encountered in double_scalars

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\ipykernel_launcher.py:13: RuntimeWarning:

invalid value encountered in double_scalars

In [12]:
cluster_df['year'] = cluster_df['date'].str[0:4]
cluster_df['month'] = cluster_df['date'].str[5:7]

seltzer=['BON VIV SPIKED SELTZER','CACTI AGAVE SPIKED SELTZER','COORS HARD SELTZER','CORONA HARD SELTZER',
         'MIKES HARD LEMONADE SELTZER', 'SMIRNOFF HARD SELTZER','VIZZY HARD SELTZER','WHITE CLAW HARD SELTZER',
         'WHITE CLAW HARD SELTZER ICED TEA', 'TRULY SPIKED AND SPARKLING ALCOHOLIC BEVERAGE', 'CAPE LINE BEVERAGES-ALCOHOLIC']

malt = ["MICKEY'S FINE MALT LIQUOR", 'CORONA REFRESCA', 'FOUR LOCO BEVERAGES-ALCOHOLIC', 'FOUR LOCO CAFFEINATED BEVERAGES-ALCOHOLIC',
'GUINNESS NITRO IPA', 'HENRYS HARD SODA', 'MIKES BEVERAGES-ALCOHOLIC MALT COCKTAIL', 'MIKES HARD LEMONADE',
'PALM BREEZE BEVERAGES-ALCOHOLIC', 'SMIRNOFF ICE BEVERAGES-ALCOHOLIC', 'TWISTED TEA BEVERAGES-ALCOHOLIC MALT LIQUOR']

ale = ['ATHLETIC BREWING BEER', 'BALLAST POINT BREWING CO BEER', 'BLUE MOON BEER', 'SAUCY BREW WORKS BEER',
'SHIPYARD BEER ISLAND TIME', 'SIERRA NEVADA BEER', 'TRAVELER BEER']

hard_cider = ['ANGRY ORCHARD BEVERAGES-ALCOHOLIC HARD CIDER', 'CRISPIN BEVERAGES-ALCOHOLIC',
'SMITH & FORGE BEVERAGES-ALCOHOLIC HARD CIDER', 'STRONGBOW BEVERAGES-ALCOHOLIC HARD CIDER']

lager = ['AMSTEL BEER', 'CERVEZA SOL', 'CORONA EXTRA BEER', 'CORONA LIGHT BEER', 'GOLD BUCKLE BEER','GUINNESS BLONDE AMERICAN LAGER BEER',
'HEINEKEN BEER', 'MILLER HIGH LIFE BEER', 'MODELO ESPECIAL BEER', 'PABST BEER', 'PERONI BEER', 'SAMUEL ADAMS BEER',
'SHINER BEER', 'TECATE BEER']

light_beer = ['COORS LIGHT BEER', 'CORONA PREMIER BEER', 'HEINEKEN LIGHT BEER', 'MILLER LITE BEER', 'SEAGRAMS ESCAPES BEVERAGES-ALCOHOLIC',
'TECATE LIGHT BEER']
In [13]:
a = cluster_df['brand'].to_list()
In [14]:
# classify beer_type
beer_type = []
for i in a:
    if i in seltzer:
        beer_type.append('seltzer')
    elif i in malt:
        beer_type.append('malt')
    elif i in ale:
        beer_type.append('ale')
    elif i in hard_cider:
        beer_type.append('hard_cider')
    elif i in lager:
        beer_type.append('lager')
    elif i in light_beer:
        beer_type.append('light_beer')
    else:
        beer_type.append('beer')
        
In [15]:
# create beer_type column
cluster_df['beer_type'] = beer_type
In [16]:
Boston_Beer_Company =['ANGRY ORCHARD BEVERAGES-ALCOHOLIC HARD CIDER', 'SAMUEL ADAMS BEER', 'TRAVELER BEER',
'TRULY SPIKED AND SPARKLING ALCOHOLIC BEVERAGE', 'TWISTED TEA BEVERAGES-ALCOHOLIC MALT LIQUOR']

Constellation_Brands = ['CORONA EXTRA BEER', 'CORONA HARD SELTZER', 'CORONA LIGHT BEER', 'CORONA PREMIER BEER',
'CORONA REFRESCA', 'MODELO ESPECIAL BEER', 'MODELO ESPECIAL CHELADA BEER']

Diageo = ['GUINESS COLD BREW', 'GUINNESS BEER', 'GUINNESS BLONDE AMERICAN LAGER BEER',
'GUINNESS NITRO IPA', 'SEAGRAMS ESCAPES BEVERAGES-ALCOHOLIC']

Heineken = ['AMSTEL BEER', 'DOS EQUIS BEER', 'HEINEKEN BEER', 'HEINEKEN LIGHT BEER', 'SMIRNOFF HARD SELTZER',
'SMIRNOFF ICE BEVERAGES-ALCOHOLIC', 'STRONGBOW BEVERAGES-ALCOHOLIC HARD CIDER', 'TECATE BEER', 'TECATE LIGHT BEER']

Mark_Anthony_Group = ['MIKES BEVERAGES-ALCOHOLIC MALT COCKTAIL', 'MIKES HARD LEMONADE SELTZER', 'MIKES HARD LEMONADE', 'PALM BREEZE BEVERAGES-ALCOHOLIC',
         'WHITE CLAW HARD SELTZER ICED TEA', 'WHITE CLAW HARD SELTZER']


Molson_Coors = ["MICKEY'S FINE MALT LIQUOR", 'BLUE MOON BEER', 'CAPE LINE BEVERAGES-ALCOHOLIC',
'CERVEZA SOL', 'COORS BREWING BEER', 'COORS HARD SELTZER', 'COORS LIGHT BEER', 'CRISPIN BEVERAGES-ALCOHOLIC',
'LEINENKUGELS BEER', 'MILLER BREWING BEER', 'MILLER HIGH LIFE BEER', 'MILLER LITE BEER', 'PERONI BEER', 'SAINT ARCHER BREWING',
'SMITH & FORGE BEVERAGES-ALCOHOLIC HARD CIDER', 'VIZZY HARD SELTZER']

Phusion_Projects = ['FOUR LOCO BEVERAGES-ALCOHOLIC', 'FOUR LOCO CAFFEINATED BEVERAGES-ALCOHOLIC']

Classify brands by brewery

In [17]:
# classify brewery
brewery = []
for i in a:
    if i in Boston_Beer_Company:
        brewery.append('Boston Beer Company')
    elif i in Constellation_Brands:
        brewery.append('Constellation Brands')
    elif i in Diageo:
        brewery.append('Diageo')
    elif i in Heineken:
        brewery.append('Heineken')
    elif i in Mark_Anthony_Group:
        brewery.append('Mark Anthony Group')
    elif i in Molson_Coors:
        brewery.append('Molson Coors')
    elif i in Phusion_Projects:
        brewery.append('Phusion Projects')
    else:
        brewery.append('others')
        
In [18]:
# create brewery column
cluster_df['brewery'] = brewery
In [19]:
cluster_df = cluster_df.loc[(cluster_df["brand"] != 'year') | (cluster_df["brand"] != 'month')]

Create DataFrame for SoV

In [20]:
pd.set_option('display.max_rows', 10)
final = cluster_df[cluster_df['beer_type'] != 'beer']
final.head()
Out[20]:
brand internal_prop external_prop grp z_score mu std count date year month beer_type brewery
0 AMSTEL BEER 0.000000 0.000000 0.00 0.000000 0.000000 0.000000 0 2015-01-01 00:00:00 2015 01 lager Heineken
1 ANGRY ORCHARD BEVERAGES-ALCOHOLIC HARD CIDER 0.141486 0.031719 106.12 0.087126 62.503333 19.487036 12 2015-01-01 00:00:00 2015 01 hard_cider Boston Beer Company
2 ATHLETIC BREWING BEER 0.000000 0.000000 0.00 0.000000 0.000000 0.000000 0 2015-01-01 00:00:00 2015 01 ale others
3 BALLAST POINT BREWING CO BEER 0.000000 0.000000 0.00 0.000000 0.000000 0.000000 0 2015-01-01 00:00:00 2015 01 ale others
4 BLUE MOON BEER 0.000699 0.000242 0.81 0.000176 96.595833 47.615934 12 2015-01-01 00:00:00 2015 01 ale Molson Coors

Beer_type GRP Analysis

In [21]:
final1 = final[['brand', 'grp', 'date', 'beer_type']]
c = final1.groupby(['beer_type','date']).sum()
c = c.reset_index()
c.head()
Out[21]:
beer_type date grp
0 ale 2015-01-01 00:00:00 0.81
1 ale 2015-02-01 00:00:00 1.10
2 ale 2015-03-01 00:00:00 154.54
3 ale 2015-04-01 00:00:00 199.16
4 ale 2015-05-01 00:00:00 188.10
In [22]:
d = c.groupby('date').sum()
d = d.rename(columns={"grp": "grp_sum"})
d.head()
Out[22]:
grp_sum
date
2015-01-01 00:00:00 1330.66
2015-02-01 00:00:00 716.92
2015-03-01 00:00:00 1566.07
2015-04-01 00:00:00 1735.61
2015-05-01 00:00:00 2297.72
In [23]:
# join c and d table
e = pd.merge(c,d, on='date', how='left')
e.head()
Out[23]:
beer_type date grp grp_sum
0 ale 2015-01-01 00:00:00 0.81 1330.66
1 ale 2015-02-01 00:00:00 1.10 716.92
2 ale 2015-03-01 00:00:00 154.54 1566.07
3 ale 2015-04-01 00:00:00 199.16 1735.61
4 ale 2015-05-01 00:00:00 188.10 2297.72
In [24]:
# create a percentage column
e['percentage'] = e['grp'] * 100 / e['grp_sum']
e.head()
Out[24]:
beer_type date grp grp_sum percentage
0 ale 2015-01-01 00:00:00 0.81 1330.66 0.060872
1 ale 2015-02-01 00:00:00 1.10 716.92 0.153434
2 ale 2015-03-01 00:00:00 154.54 1566.07 9.868014
3 ale 2015-04-01 00:00:00 199.16 1735.61 11.474928
4 ale 2015-05-01 00:00:00 188.10 2297.72 8.186376

GRP by Quarters

In [25]:
# Extract brand, grp, date, year, month, beer_type, brewery columns from final dataframe
f = final[['brand', 'grp', 'date','year', 'month', 'beer_type', 'brewery']]
f.head()
Out[25]:
brand grp date year month beer_type brewery
0 AMSTEL BEER 0.00 2015-01-01 00:00:00 2015 01 lager Heineken
1 ANGRY ORCHARD BEVERAGES-ALCOHOLIC HARD CIDER 106.12 2015-01-01 00:00:00 2015 01 hard_cider Boston Beer Company
2 ATHLETIC BREWING BEER 0.00 2015-01-01 00:00:00 2015 01 ale others
3 BALLAST POINT BREWING CO BEER 0.00 2015-01-01 00:00:00 2015 01 ale others
4 BLUE MOON BEER 0.81 2015-01-01 00:00:00 2015 01 ale Molson Coors
In [26]:
# Create quarter list using month and year column
quarter = []
for year in list(f['year'].unique()):
    sub = f[f['year'] == year]
    for i in sub['month']:
        if int(i) < 4:
            quarter.append(year + 'Q1')
        elif (int(i) > 3) & (int(i) < 7):
            quarter.append(year + 'Q2')
        elif (int(i) > 6) & (int(i) < 10):
            quarter.append(year + 'Q3')
        else:
            quarter.append(year + 'Q4')
In [27]:
len(quarter)
Out[27]:
4029
In [28]:
import datetime as dt
import pandas as pd

f['quarter'] =  quarter
f.head()
C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[28]:
brand grp date year month beer_type brewery quarter
0 AMSTEL BEER 0.00 2015-01-01 00:00:00 2015 01 lager Heineken 2015Q1
1 ANGRY ORCHARD BEVERAGES-ALCOHOLIC HARD CIDER 106.12 2015-01-01 00:00:00 2015 01 hard_cider Boston Beer Company 2015Q1
2 ATHLETIC BREWING BEER 0.00 2015-01-01 00:00:00 2015 01 ale others 2015Q1
3 BALLAST POINT BREWING CO BEER 0.00 2015-01-01 00:00:00 2015 01 ale others 2015Q1
4 BLUE MOON BEER 0.81 2015-01-01 00:00:00 2015 01 ale Molson Coors 2015Q1

Calculate Quarterly SOV of GRP by beer_type

In [29]:
g = f.groupby(['quarter', 'beer_type']).sum()
g = g.reset_index()
g.head()
Out[29]:
quarter beer_type grp
0 2015Q1 ale 156.45
1 2015Q1 hard_cider 807.82
2 2015Q1 lager 1289.15
3 2015Q1 light_beer 1346.25
4 2015Q1 malt 13.98
In [30]:
n = f.groupby(['quarter']).sum()
n = n.reset_index()
n.head()
Out[30]:
quarter grp
0 2015Q1 3613.65
1 2015Q2 6311.66
2 2015Q3 6170.59
3 2015Q4 4351.40
4 2016Q1 3808.56
In [31]:
p = pd.merge(n,g, on='quarter', how='left')
p.head()
Out[31]:
quarter grp_x beer_type grp_y
0 2015Q1 3613.65 ale 156.45
1 2015Q1 3613.65 hard_cider 807.82
2 2015Q1 3613.65 lager 1289.15
3 2015Q1 3613.65 light_beer 1346.25
4 2015Q1 3613.65 malt 13.98
In [32]:
p['percentage'] = p['grp_y'] * 100 / p['grp_x']
p.head()
Out[32]:
quarter grp_x beer_type grp_y percentage
0 2015Q1 3613.65 ale 156.45 4.329418
1 2015Q1 3613.65 hard_cider 807.82 22.354683
2 2015Q1 3613.65 lager 1289.15 35.674457
3 2015Q1 3613.65 light_beer 1346.25 37.254576
4 2015Q1 3613.65 malt 13.98 0.386866
In [33]:
# Quarterly SOV by beer_type
import plotly.graph_objects as go
fig = px.bar(p, y='percentage', x='quarter',text='percentage', color='beer_type',
            color_discrete_map={
            'seltzer': 'red',
            'lager': 'royalblue',
            'malt': 'purple',
            'light_beer': 'mediumseagreen',
            'hard_cider': 'orange'})
fig.update_traces(texttemplate='%{text:.2s}', textposition='inside')
fig.show()
In [34]:
# Quarterly GRP_sum by beer_type
import plotly.graph_objects as go
fig = px.bar(g, y='grp', x='quarter', color='beer_type')
fig.show()
In [35]:
# Quarterly GRP_sum by seltzer
import plotly.graph_objects as go
fig = px.bar(g[g['beer_type'] == 'seltzer'], y='grp', x='quarter', color='beer_type')
fig.show()
In [36]:
# Quarterly GRP_sum by malt
import plotly.graph_objects as go
fig = px.bar(g[g['beer_type'] == 'malt'], y='grp', x='quarter', color='beer_type')
fig.show()
In [37]:
g = f.groupby(['quarter', 'brewery']).sum()
g = g.reset_index()
g.head()
Out[37]:
quarter brewery grp
0 2015Q1 Boston Beer Company 778.26
1 2015Q1 Constellation Brands 261.47
2 2015Q1 Diageo 0.00
3 2015Q1 Heineken 1099.65
4 2015Q1 Mark Anthony Group 0.00
In [38]:
l = f.groupby(['quarter']).sum()
l = l.reset_index()
l.head()
Out[38]:
quarter grp
0 2015Q1 3613.65
1 2015Q2 6311.66
2 2015Q3 6170.59
3 2015Q4 4351.40
4 2016Q1 3808.56
In [39]:
m = pd.merge(l,g, on='quarter', how='left')
m.head()
Out[39]:
quarter grp_x brewery grp_y
0 2015Q1 3613.65 Boston Beer Company 778.26
1 2015Q1 3613.65 Constellation Brands 261.47
2 2015Q1 3613.65 Diageo 0.00
3 2015Q1 3613.65 Heineken 1099.65
4 2015Q1 3613.65 Mark Anthony Group 0.00
In [40]:
m['percentage'] = m['grp_y'] * 100 / m['grp_x']
In [41]:
# Quarterly percentage by brewery
import plotly.graph_objects as go
fig = px.bar(m, y='percentage', x='quarter',text='percentage', color='brewery')
fig.update_traces(texttemplate='%{text:.2s}', textposition='inside')
fig.update_layout(title_text='Quarterly SOV trends')
fig.show()
In [42]:
# Quarterly grp_sum by brewery
import plotly.graph_objects as go
fig = px.bar(g, y='grp', x='quarter',text='grp', color='brewery')
fig.update_traces(texttemplate='%{text:.2s}', textposition='inside')
fig.show()

Yearly

In [43]:
g1 = f.groupby(['year', 'beer_type']).sum()
g1 = g1.reset_index()
In [44]:
# Yearly grp_sum by beer_type
import plotly.graph_objects as go
fig = px.bar(g1, y='grp', x='year', color='beer_type')
fig.show()
In [45]:
# Yearly grp_sum by seltzer
import plotly.graph_objects as go
fig = px.bar(g1[g1['beer_type'] == 'seltzer'], y='grp', x='year', color='beer_type')
fig.show()
In [46]:
# Yearly grp_sum by malt
import plotly.graph_objects as go
fig = px.bar(g1[g1['beer_type'] == 'malt'], y='grp', x='year', color='beer_type')
fig.show()
In [47]:
g1 = f.groupby(['year', 'brewery']).sum()
g1 = g1.reset_index()
In [48]:
# Yearly grp_sum by brewery
import plotly.graph_objects as go
fig = px.bar(g1, y='grp', x='year', color='brewery')
fig.show()

Monthly SOV by beer_type

In [49]:
import plotly.graph_objects as go
fig = px.bar(e, y='percentage', x='date', color='beer_type',
                         color_discrete_map={
            'seltzer': 'red',
            'lager': 'royalblue',
            'malt': 'purple',
            'light_beer': 'mediumseagreen',
            'hard_cider': 'orange'})
fig.update_layout(title_text='SOV Trends')
fig.show()

Monthly SOV by bewery

In [50]:
cluster_df1 = cluster_df[['brand', 'grp', 'date', 'brewery', 'beer_type']]
f = cluster_df1.groupby(['brewery','date']).sum()
f = f.reset_index()
f = f[f['brewery'] != 'others']
In [51]:
g = f.groupby('date').sum()
g = g.rename(columns={"grp": "grp_sum"})
In [52]:
# pd.set_option('display.max_rows', 10)
h = pd.merge(f,g, on='date', how='left')
h['percentage'] = h['grp'] * 100 / h['grp_sum']
In [53]:
import plotly.graph_objects as go
fig = px.bar(h, y='percentage', x='date', text='percentage', color='brewery')
fig.update_traces(texttemplate='%{text:.2s}', textposition='inside')
fig.show()

Each Brewery by beer_type

In [54]:
cluster_df1 = cluster_df[['brand', 'grp', 'date', 'brewery', 'beer_type']]
i = cluster_df1.groupby(['beer_type', 'brewery','date']).sum()
i = i.reset_index()
i = i[i['brewery'] != 'others']
In [55]:
k = pd.merge(i,g, on='date', how='left')
In [56]:
# Constellation Brand
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Constellation Brands'], y='grp', x='date', color='beer_type',
            color_discrete_map={
            'seltzer': 'red',
            'lager': 'royalblue',
            'malt': 'purple',
            'light_beer': 'mediumseagreen'})
fig.update_layout(title_text='Shift in SOV (Constellation Brands)')
fig.show()
In [57]:
# Heineken
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Heineken'], y='grp', x='date', color='beer_type',
            color_discrete_map={
            'seltzer': 'red',
            'lager': 'royalblue',
            'malt': 'purple',
            'light_beer': 'mediumseagreen',
            'hard_cider': 'orange'})
fig.update_layout(title_text='Shift in SOV (Heineken)')
fig.show()
In [58]:
# Molson Coors
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Molson Coors'], y='grp', x='date', color='beer_type',
            color_discrete_map={
            'seltzer': 'red',
            'lager': 'royalblue',
            'malt': 'purple',
            'light_beer': 'mediumseagreen',
            'hard_cider': 'orange'})
fig.update_layout(title_text='Shift in SOV (Molson Coors)')
fig.show()
In [59]:
# Boston Beer Company
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Boston Beer Company'], y='grp', x='date', color='beer_type',
            color_discrete_map={
            'seltzer': 'red',
            'lager': 'royalblue',
            'malt': 'purple',
            'light_beer': 'mediumseagreen',
            'hard_cider': 'orange'})
fig.update_layout(title_text='Shift in SOV (Boston Beer Company)')
fig.show()
In [60]:
# Diageo
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Diageo'], y='grp', x='date', color='beer_type',
            color_discrete_map={
            'seltzer': 'red',
            'lager': 'royalblue',
            'malt': 'purple',
            'light_beer': 'mediumseagreen',
            'hard_cider': 'orange'})
fig.update_layout(title_text='Shift in SOV (Diageo)')
fig.show()

Brewery by brands

In [61]:
# Constellation Brands
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Constellation Brands'], y='grp', x='date', color='brand')
fig.show()
In [62]:
# Heineken
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Heineken'], y='grp', x='date', color='brand')
fig.show()
In [63]:
# Molson Coors
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Molson Coors'], y='grp', x='date', color='brand')
fig.show()
In [64]:
# Boston Beer Company
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Boston Beer Company'], y='grp', x='date', color='brand')
fig.show()
In [65]:
# Diageo'
import plotly.graph_objects as go
fig = px.bar(cluster_df[cluster_df['brewery'] == 'Diageo'], y='grp', x='date', color='beer_type')
fig.show()

Predictions

Top 5 well-known brands

In [66]:
#Replace NaN values with 0
c0= co.fillna(0)
In [67]:
#Index date column
c0['ds']=pd.DatetimeIndex(c0['date'])

Corona

In [68]:
#Corona
corona =c0[['date', 'CORONA EXTRA BEER']]
corona.columns = ['ds', 'y'] #Need to rename the columns

#Split the data
train=corona.iloc[:-12]#except the last 30 values as training set
test=corona.iloc[-12:] #last 30 values as testing values
print('train shape', train.shape)
print('test shape', test.shape)

# set the date into index
train.set_index('ds', inplace=True)
test.set_index('ds', inplace=True)

#Set the Index for 2 years

index_years = pd.date_range(train.index[-1], freq='MS', periods = 19) #MS= month start
index_years
train shape (67, 2)
test shape (12, 2)
Out[68]:
DatetimeIndex(['2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
               '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
               '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01',
               '2021-11-01', '2021-12-01', '2022-01-01'],
              dtype='datetime64[ns]', freq='MS')
In [69]:
from pmdarima import auto_arima
#autoarima function: try different combinations of orders and each model will assign a score AIC.
#lower the AIC is better
stepwise_fit = auto_arima(corona['y'], trace=True,
                          suppress_warnings=True)

#we can test different p,d,q values in future but it can take too long
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=982.297, Time=0.33 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=1010.616, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=980.564, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=985.233, Time=0.07 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=1122.163, Time=0.01 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=979.896, Time=0.11 sec
 ARIMA(3,0,0)(0,0,0)[0] intercept   : AIC=980.602, Time=0.17 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=976.816, Time=0.24 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=980.787, Time=0.06 sec
 ARIMA(3,0,1)(0,0,0)[0] intercept   : AIC=982.689, Time=0.14 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=981.214, Time=0.10 sec
 ARIMA(3,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.37 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=996.904, Time=0.11 sec

Best model:  ARIMA(2,0,1)(0,0,0)[0] intercept
Total fit time: 2.279 seconds
In [70]:
#1: Running ARIMA with best model
from statsmodels.tsa.arima_model import ARIMA

model_arima = ARIMA(train, order=(2,0,1)) # Best model
model_arima_fit = model_arima.fit(disp=-1)

# Saving ARIMA predictions
fcast1 = model_arima_fit.forecast(19)[0]

# Passing the same index as the others
fcast1 = pd.Series(fcast1, index=index_years)
fcast1 = fcast1.rename("Arima") 

##################################################
#2: Running auto ARIMA 

import pmdarima as pm
auto_arima_model = pm.auto_arima(train, seasonal=False, m=19)

# Read more about setting m
# https://alkaline-ml.com/pmdarima/tips_and_tricks.html

# make your forecasts
fcast2 = auto_arima_model.predict(19) 
fcast2 = pd.Series(fcast2, index=index_years)
fcast2 = fcast2.rename("Auto Arima")

#################################################
#3: Running fbprophet
from pandas import to_datetime

#define the model
model = Prophet()
# fit the model
model.fit(corona)

# define the period for which we want a prediction
df_index_years = pd.DataFrame(index_years)
df_index_years.columns = ['ds']
df_index_years['ds']= to_datetime(df_index_years['ds'])

# use the model to make a forecast
fcast3 = model.predict(df_index_years)
fcast3 = pd.Series(fcast3['yhat'].values, index=index_years)
fcast3 = fcast3.rename("Prophet")
C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:162: ValueWarning:

No frequency information was provided, so inferred frequency MS will be used.

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\pmdarima\arima\_validation.py:62: UserWarning:

m (19) set for non-seasonal fit. Setting to 0

INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
In [71]:
# Ploting the predictions
fig, ax = plt.subplots(figsize=(15,5))
chart = sns.lineplot(x='ds', y='y', data = corona)
chart.set_title('Corona')
fcast1.plot(ax=ax, color='red', marker="o", legend=True)
fcast2.plot(ax=ax, color='green', marker="o", legend=True)
fcast3.plot(ax=ax, color='orange', marker="o", legend=True)

test.plot(ax=ax, color='blue', marker="o", legend=True)
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f73ff9ff08>

Modelo

In [72]:
modelo =c0[['date', 'MODELO ESPECIAL BEER']]
modelo.columns = ['ds', 'y'] #Need to rename the columns

#modelo =modelo.iloc[48:] #only ast 2 years or so
In [73]:
#Split the data
train_m=modelo.iloc[:-12]#from 2019-07-01 to 2020-07-01
test_m=modelo.iloc[-12:] #last 30 values as testing values

# set the date into index
train_m.set_index('ds', inplace=True)
test_m.set_index('ds', inplace=True)

#Set the Index for 2 years

index_m = pd.date_range(train_m.index[-1], freq='MS', periods = 19) #MS= month start
index_m
Out[73]:
DatetimeIndex(['2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
               '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
               '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01',
               '2021-11-01', '2021-12-01', '2022-01-01'],
              dtype='datetime64[ns]', freq='MS')
In [74]:
stepwise_fit = auto_arima(modelo['y'], trace=True,
                          suppress_warnings=True)
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=960.526, Time=0.18 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=991.711, Time=0.01 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=989.316, Time=0.04 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.07 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=989.869, Time=0.01 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.14 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=965.618, Time=0.12 sec
 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.18 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=959.680, Time=0.26 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=inf, Time=0.23 sec
 ARIMA(3,1,3)(0,0,0)[0] intercept   : AIC=inf, Time=0.27 sec
 ARIMA(2,1,4)(0,0,0)[0] intercept   : AIC=inf, Time=0.27 sec
 ARIMA(1,1,4)(0,0,0)[0] intercept   : AIC=inf, Time=0.28 sec
 ARIMA(3,1,4)(0,0,0)[0] intercept   : AIC=inf, Time=0.38 sec
 ARIMA(2,1,3)(0,0,0)[0]             : AIC=957.786, Time=0.24 sec
 ARIMA(1,1,3)(0,0,0)[0]             : AIC=964.253, Time=0.11 sec
 ARIMA(2,1,2)(0,0,0)[0]             : AIC=958.698, Time=0.13 sec
 ARIMA(3,1,3)(0,0,0)[0]             : AIC=953.663, Time=0.21 sec
 ARIMA(3,1,2)(0,0,0)[0]             : AIC=961.212, Time=0.13 sec
 ARIMA(4,1,3)(0,0,0)[0]             : AIC=943.775, Time=0.31 sec
 ARIMA(4,1,2)(0,0,0)[0]             : AIC=955.418, Time=0.11 sec
 ARIMA(5,1,3)(0,0,0)[0]             : AIC=945.637, Time=0.32 sec
 ARIMA(4,1,4)(0,0,0)[0]             : AIC=inf, Time=0.32 sec
 ARIMA(3,1,4)(0,0,0)[0]             : AIC=inf, Time=0.24 sec
 ARIMA(5,1,2)(0,0,0)[0]             : AIC=inf, Time=0.32 sec
 ARIMA(5,1,4)(0,0,0)[0]             : AIC=inf, Time=0.34 sec
 ARIMA(4,1,3)(0,0,0)[0] intercept   : AIC=inf, Time=0.32 sec

Best model:  ARIMA(4,1,3)(0,0,0)[0]          
Total fit time: 5.572 seconds
In [75]:
####### Running ARIMA with best model
modelo_arima = ARIMA(train_m, order=(4,1,3)) # Best model
modelo_arima_fit = modelo_arima.fit(disp=-1)

# Saving ARIMA predictions
fcast1_m = modelo_arima_fit.forecast(19)[0]

# Passing the same index as the others
fcast1_m = pd.Series(fcast1_m, index=index_m)
fcast1_m = fcast1_m.rename("Arima") 

####### Auto ARIMA

# Running auto ARIMA 
auto_arima_modelo = pm.auto_arima(train_m, seasonal=False, m=19)

# make your forecasts
fcast2_m = auto_arima_modelo.predict(19) 
fcast2_m = pd.Series(fcast2_m, index=index_m)
fcast2_m = fcast2_m.rename("Auto Arima")


####### fbProphet

from pandas import to_datetime

# define the model
prophet_modelo = Prophet()
# fit the model
prophet_modelo.fit(modelo)

# define the period for which we want a prediction
df_m = pd.DataFrame(index_m)
df_m.columns = ['ds']
df_m['ds']= to_datetime(df_m['ds'])

# use the model to make a forecast
fcast3_m = prophet_modelo.predict(df_m)
fcast3_m = pd.Series(fcast3_m['yhat'].values, index=index_m)
fcast3_m = fcast3_m.rename("Prophet")
C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:162: ValueWarning:

No frequency information was provided, so inferred frequency MS will be used.

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:162: ValueWarning:

No frequency information was provided, so inferred frequency MS will be used.

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\pmdarima\arima\_validation.py:62: UserWarning:

m (19) set for non-seasonal fit. Setting to 0

INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
In [76]:
# Ploting the predictions
fig, ax = plt.subplots(figsize=(15,5))
chart = sns.lineplot(x='ds', y='y', data = modelo)
chart.set_title('Modelo')

fcast1_m.plot(ax=ax, color='red', marker="o", legend=True)
fcast2_m.plot(ax=ax, color='green', marker="o", legend=True)
fcast3_m.plot(ax=ax, color='orange', marker="o", legend=True)

test_m.plot(ax=ax, color='blue', marker="o", legend=True)
Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f7455bb7c8>

Heineken

In [77]:
heineken =c0[['date', 'HEINEKEN BEER']]
heineken.columns = ['ds', 'y'] #Need to rename the columns
heineken =heineken.iloc[54:]
In [78]:
#Split the data
train_m=heineken.iloc[:-12]#from 2019-07-01 to 2020-07-01
test_m=heineken.iloc[-12:] #last 30 values as testing values

# set the date into index
train_m.set_index('ds', inplace=True)
test_m.set_index('ds', inplace=True)

#Set the Index for 2 years

index_m = pd.date_range(train_m.index[-1], freq='MS', periods = 19) #MS= month start
index_m
Out[78]:
DatetimeIndex(['2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
               '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
               '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01',
               '2021-11-01', '2021-12-01', '2022-01-01'],
              dtype='datetime64[ns]', freq='MS')
In [79]:
stepwise_fit = auto_arima(heineken['y'], trace=True,
                          suppress_warnings=True)
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.10 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=295.574, Time=0.00 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=296.590, Time=0.04 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=295.008, Time=0.04 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=344.965, Time=0.01 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.10 sec
 ARIMA(0,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.04 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.09 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=inf, Time=0.03 sec

Best model:  ARIMA(0,0,1)(0,0,0)[0] intercept
Total fit time: 0.458 seconds
In [80]:
####### Running ARIMA with best model
modelo_arima = ARIMA(train_m, order=(0,0,1)) # Best model
modelo_arima_fit = modelo_arima.fit(disp=-1)

# Saving ARIMA predictions
fcast1_m = modelo_arima_fit.forecast(19)[0]

# Passing the same index as the others
fcast1_m = pd.Series(fcast1_m, index=index_m)
fcast1_m = fcast1_m.rename("Arima") 

####### Auto ARIMA

# Running auto ARIMA 
auto_arima_modelo = pm.auto_arima(train_m, seasonal=False, m=19)

# make your forecasts
fcast2_m = auto_arima_modelo.predict(19) 
fcast2_m = pd.Series(fcast2_m, index=index_m)
fcast2_m = fcast2_m.rename("Auto Arima")


####### fbProphet

from pandas import to_datetime

# define the model
prophet_modelo = Prophet()
# fit the model
prophet_modelo.fit(heineken)

# define the period for which we want a prediction
df_m = pd.DataFrame(index_m)
df_m.columns = ['ds']
df_m['ds']= to_datetime(df_m['ds'])

# use the model to make a forecast
fcast3_m = prophet_modelo.predict(df_m)
fcast3_m = pd.Series(fcast3_m['yhat'].values, index=index_m)
fcast3_m = fcast3_m.rename("Prophet")
C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:162: ValueWarning:

No frequency information was provided, so inferred frequency MS will be used.

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\pmdarima\arima\_validation.py:62: UserWarning:

m (19) set for non-seasonal fit. Setting to 0

INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 19.
In [81]:
# Ploting the predictions
fig, ax = plt.subplots(figsize=(15,5))
chart = sns.lineplot(x='ds', y='y', data = heineken)
chart.set_title('Heineken')

fcast1_m.plot(ax=ax, color='red', marker="o", legend=True)
fcast2_m.plot(ax=ax, color='green', marker="o", legend=True)
fcast3_m.plot(ax=ax, color='orange', marker="o", legend=True)

test_m.plot(ax=ax, color='blue', marker="o", legend=True)
Out[81]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f744874348>

COORS LIGHT BEER

In [82]:
coors =c0[['date', 'COORS LIGHT BEER']]
coors.columns = ['ds', 'y'] #Need to rename the columns
coors =coors.iloc[54:]
In [83]:
#Split the data
train_m=coors.iloc[:-12]#from 2019-07-01 to 2020-07-01
test_m=coors.iloc[-12:] #last 30 values as testing values

# set the date into index
train_m.set_index('ds', inplace=True)
test_m.set_index('ds', inplace=True)

#Set the Index for 2 years

index_m = pd.date_range(train_m.index[-1], freq='MS', periods = 19) #MS= month start
index_m
Out[83]:
DatetimeIndex(['2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
               '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
               '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01',
               '2021-11-01', '2021-12-01', '2022-01-01'],
              dtype='datetime64[ns]', freq='MS')
In [84]:
stepwise_fit = auto_arima(coors['y'], trace=True,
                          suppress_warnings=True)
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.16 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=258.911, Time=0.01 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=258.622, Time=0.03 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=259.066, Time=0.04 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=258.122, Time=0.01 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=260.510, Time=0.07 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0]          
Total fit time: 0.324 seconds
In [85]:
####### Running ARIMA with best model
modelo_arima = ARIMA(train_m, order=(0,1,0)) # Best model is ARIMA(0,0,1) for 2 years or so
modelo_arima_fit = modelo_arima.fit(disp=-1)

# Saving ARIMA predictions
fcast1_m = modelo_arima_fit.forecast(19)[0]

# Passing the same index as the others
fcast1_m = pd.Series(fcast1_m, index=index_m)
fcast1_m = fcast1_m.rename("Arima") 

####### Auto ARIMA

# Running auto ARIMA 
auto_arima_modelo = pm.auto_arima(train_m, seasonal=False, m=19)

# make your forecasts
fcast2_m = auto_arima_modelo.predict(19) 
fcast2_m = pd.Series(fcast2_m, index=index_m)
fcast2_m = fcast2_m.rename("Auto Arima")


####### fbProphet

from pandas import to_datetime

# define the model
prophet_modelo = Prophet()
# fit the model
prophet_modelo.fit(coors)

# define the period for which we want a prediction
df_m = pd.DataFrame(index_m)
df_m.columns = ['ds']
df_m['ds']= to_datetime(df_m['ds'])

# use the model to make a forecast
fcast3_m = prophet_modelo.predict(df_m)
fcast3_m = pd.Series(fcast3_m['yhat'].values, index=index_m)
fcast3_m = fcast3_m.rename("Prophet")
C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:162: ValueWarning:

No frequency information was provided, so inferred frequency MS will be used.

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:162: ValueWarning:

No frequency information was provided, so inferred frequency MS will be used.

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\pmdarima\arima\_validation.py:62: UserWarning:

m (19) set for non-seasonal fit. Setting to 0

INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 19.
In [86]:
# Ploting the predictions
fig, ax = plt.subplots(figsize=(15,5))
chart = sns.lineplot(x='ds', y='y', data = coors)
chart.set_title('COORS LIGHT BEER')

fcast1_m.plot(ax=ax, color='red', marker="o", legend=True)
fcast2_m.plot(ax=ax, color='green', marker="o", legend=True)
fcast3_m.plot(ax=ax, color='orange', marker="o", legend=True)

test_m.plot(ax=ax, color='blue', marker="o", legend=True)
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f745559d08>

MILLER LITE BEER

In [87]:
miller =c0[['date', 'MILLER LITE BEER']]
miller.columns = ['ds', 'y'] #Need to rename the columns
miller =miller.iloc[54:]
In [88]:
#Split the data
train_m=miller.iloc[:-12]#from 2019-07-01 to 2020-07-01
test_m=miller.iloc[-12:] #last 30 values as testing values

# set the date into index
train_m.set_index('ds', inplace=True)
test_m.set_index('ds', inplace=True)

#Set the Index for 2 years

index_m = pd.date_range(train_m.index[-1], freq='MS', periods = 19) #MS= month start
index_m
Out[88]:
DatetimeIndex(['2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
               '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
               '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01',
               '2021-11-01', '2021-12-01', '2022-01-01'],
              dtype='datetime64[ns]', freq='MS')
In [89]:
stepwise_fit = auto_arima(miller['y'], trace=True,
                          suppress_warnings=True)
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.21 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=276.181, Time=0.07 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=271.295, Time=0.03 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=274.559, Time=0.04 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=315.290, Time=0.00 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=271.134, Time=0.09 sec
 ARIMA(3,0,0)(0,0,0)[0] intercept   : AIC=273.060, Time=0.14 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=272.017, Time=0.12 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=272.015, Time=0.11 sec
 ARIMA(3,0,1)(0,0,0)[0] intercept   : AIC=273.730, Time=0.20 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=272.448, Time=0.03 sec

Best model:  ARIMA(2,0,0)(0,0,0)[0] intercept
Total fit time: 1.062 seconds
In [90]:
####### Running ARIMA with best model
modelo_arima = ARIMA(train_m, order=(2,0,0)) # Best model is ARIMA(0,0,1) for 2 years or so
modelo_arima_fit = modelo_arima.fit(disp=-1)

# Saving ARIMA predictions
fcast1_m = modelo_arima_fit.forecast(19)[0]

# Passing the same index as the others
fcast1_m = pd.Series(fcast1_m, index=index_m)
fcast1_m = fcast1_m.rename("Arima") 

####### Auto ARIMA

# Running auto ARIMA 
auto_arima_modelo = pm.auto_arima(train_m, seasonal=False, m=19)

# make your forecasts
fcast2_m = auto_arima_modelo.predict(19) 
fcast2_m = pd.Series(fcast2_m, index=index_m)
fcast2_m = fcast2_m.rename("Auto Arima")


####### fbProphet

from pandas import to_datetime

# define the model
prophet_modelo = Prophet()
# fit the model
prophet_modelo.fit(miller)

# define the period for which we want a prediction
df_m = pd.DataFrame(index_m)
df_m.columns = ['ds']
df_m['ds']= to_datetime(df_m['ds'])

# use the model to make a forecast
fcast3_m = prophet_modelo.predict(df_m)
fcast3_m = pd.Series(fcast3_m['yhat'].values, index=index_m)
fcast3_m = fcast3_m.rename("Prophet")
C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:162: ValueWarning:

No frequency information was provided, so inferred frequency MS will be used.

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\pmdarima\arima\_validation.py:62: UserWarning:

m (19) set for non-seasonal fit. Setting to 0

INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 19.
In [91]:
# Ploting the predictions
fig, ax = plt.subplots(figsize=(15,5))
chart = sns.lineplot(x='ds', y='y', data = miller)
chart.set_title('MILLER LITE BEER')

fcast1_m.plot(ax=ax, color='red', marker="o", legend=True)
fcast2_m.plot(ax=ax, color='green', marker="o", legend=True)
fcast3_m.plot(ax=ax, color='orange', marker="o", legend=True)

test_m.plot(ax=ax, color='blue', marker="o", legend=True)
Out[91]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f74549eac8>

Seltzer

In [92]:
seltzer = df.loc[:, df.columns.isin(
    ['date','TRULY SPIKED AND SPARKLING ALCOHOLIC BEVERAGE', 'CORONA HARD SELTZER','HIGH NOON SUN SIPS',
    'SMIRNOFF HARD SELTZER','MIKES HARD LEMONADE SELTZER','WHITE CLAW HARD SELTZER ICED TEA', 'WHITE CLAW HARD SELTZER',
    'CAPE LINE BEVERAGES-ALCOHOLIC','COORS HARD SELTZER','VIZZY HARD SELTZER'])]
In [93]:
#Replace NaN values with 0
s0= seltzer.fillna(0)
In [94]:
s0['total'] = s0.sum(axis=1)
s0.tail()
C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

Out[94]:
CAPE LINE BEVERAGES-ALCOHOLIC COORS HARD SELTZER CORONA HARD SELTZER HIGH NOON SUN SIPS MIKES HARD LEMONADE SELTZER SMIRNOFF HARD SELTZER TRULY SPIKED AND SPARKLING ALCOHOLIC BEVERAGE VIZZY HARD SELTZER WHITE CLAW HARD SELTZER WHITE CLAW HARD SELTZER ICED TEA date total
74 0.0 15.16 115.44 0.00 49.30 0.00 123.44 7.61 0.00 0.00 2021-03-01 310.95
75 0.0 0.00 202.29 0.00 45.63 0.00 84.12 61.74 60.14 0.00 2021-04-01 453.92
76 0.0 0.00 234.31 8.02 28.86 85.35 149.03 57.63 97.44 0.00 2021-05-01 660.64
77 0.0 0.00 233.88 7.64 25.51 121.32 123.18 97.53 35.38 28.91 2021-06-01 673.35
78 0.0 0.00 243.85 5.62 9.65 166.75 84.30 66.73 105.87 11.18 2021-07-01 693.95
In [95]:
#Index date column
s0['ds']=pd.DatetimeIndex(s0['date'])
In [96]:
s0 =s0[['ds','total']]
s0.columns = ['ds', 'y'] #Need to rename the columns
s0 =s0.iloc[54:]
s0
Out[96]:
ds y
54 2019-07-01 373.56
55 2019-08-01 243.64
56 2019-09-01 151.07
57 2019-10-01 76.63
58 2019-11-01 144.29
... ... ...
74 2021-03-01 310.95
75 2021-04-01 453.92
76 2021-05-01 660.64
77 2021-06-01 673.35
78 2021-07-01 693.95

25 rows × 2 columns

In [97]:
#Split the data
train=s0.iloc[:-12]#from 2019-07-01 to 2020-07-01
test=s0.iloc[-12:] #last 30 values as testing values


print('train shape', train.shape)
print('test shape', test.shape)

# set the date into index
train.set_index('ds', inplace=True)
test.set_index('ds', inplace=True)

#Set the Index for 2 years

index_years = pd.date_range(train.index[-1], freq='MS', periods = 19) #MS= month start

index_years
train shape (13, 2)
test shape (12, 2)
Out[97]:
DatetimeIndex(['2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
               '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
               '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01',
               '2021-11-01', '2021-12-01', '2022-01-01'],
              dtype='datetime64[ns]', freq='MS')
In [98]:
stepwise_fit = auto_arima(s0['y'], trace=True,
                          suppress_warnings=True)
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.15 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=336.669, Time=0.00 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=325.661, Time=0.03 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=328.885, Time=0.04 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=373.305, Time=0.00 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=326.590, Time=0.04 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=326.897, Time=0.06 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=328.561, Time=0.12 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=329.239, Time=0.01 sec

Best model:  ARIMA(1,0,0)(0,0,0)[0] intercept
Total fit time: 0.456 seconds
In [99]:
####### Running ARIMA with best model
modelo_arima = ARIMA(train, order=(1,0,0)) # Best model for 2 years or so
modelo_arima_fit = modelo_arima.fit(disp=-1)

# Saving ARIMA predictions
fcast1_m = modelo_arima_fit.forecast(19)[0]

# Passing the same index as the others
fcast1_m = pd.Series(fcast1_m, index=index_years)
fcast1_m = fcast1_m.rename("Arima") 

####### Auto ARIMA

# Running auto ARIMA 
auto_arima_modelo = pm.auto_arima(train, seasonal=False, m=19)

# make your forecasts
fcast2_m = auto_arima_modelo.predict(19) 
fcast2_m = pd.Series(fcast2_m, index=index_years)
fcast2_m = fcast2_m.rename("Auto Arima")


####### fbProphet

from pandas import to_datetime

# define the model
prophet_modelo = Prophet()
# fit the model
prophet_modelo.fit(s0)

# define the period for which we want a prediction
df_m = pd.DataFrame(index_years)
df_m.columns = ['ds']
df_m['ds']= to_datetime(df_m['ds'])

# use the model to make a forecast
fcast3_m = prophet_modelo.predict(df_m)
fcast3_m = pd.Series(fcast3_m['yhat'].values, index=index_years)
fcast3_m = fcast3_m.rename("Prophet")
C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:162: ValueWarning:

No frequency information was provided, so inferred frequency MS will be used.

C:\Users\Jenny Kim Kim\anaconda3\lib\site-packages\pmdarima\arima\_validation.py:62: UserWarning:

m (19) set for non-seasonal fit. Setting to 0

INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 19.
In [100]:
# Ploting the predictions
fig, ax = plt.subplots(figsize=(15,5))
chart = sns.lineplot(x='ds', y='y', data = s0)
chart.set_title('Seltzer')
fcast1_m.plot(ax=ax, color='red', marker="o", legend=True)
fcast2_m.plot(ax=ax, color='green', marker="o", legend=True)
fcast3_m.plot(ax=ax, color='orange', marker="o", legend=True)

test.plot(ax=ax, color='blue', marker="o", legend=True)
Out[100]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f73f679ac8>